import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import ShuffleSplit, cross_val_score
from sklearn.metrics import (
mean_absolute_error,
mean_squared_error,
r2_score
)
import optuna
import pickle
# Load the pre-split Bangalore house-price data.
train_csv_path = "./Bangalore_house_prices/train.csv"
test_csv_path = "./Bangalore_house_prices/test.csv"

df_train = pd.read_csv(train_csv_path)
df_test = pd.read_csv(test_csv_path)

# Quick structural overview: columns, dtypes, and null counts.
df_train.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 4137 entries, 0 to 4136 Data columns (total 5 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 location 4137 non-null object 1 total_sqft 4137 non-null float64 2 bath 4137 non-null int64 3 price 4137 non-null float64 4 bhk 4137 non-null int64 dtypes: float64(2), int64(2), object(1) memory usage: 161.7+ KB
# Separate the features from the "price" target in both splits.
X_train, X_test = df_train.copy(), df_test.copy()
y_train = X_train.pop("price")
y_test = X_test.pop("price")

X_train.head()
| location | total_sqft | bath | bhk | |
|---|---|---|---|---|
| 0 | 1ST PHASE JP NAGAR | 1875.0 | 3 | 3 |
| 1 | 1ST PHASE JP NAGAR | 1590.0 | 3 | 3 |
| 2 | 1ST PHASE JP NAGAR | 1566.0 | 2 | 2 |
| 3 | 1ST PHASE JP NAGAR | 2065.0 | 4 | 3 |
| 4 | 1ST PHASE JP NAGAR | 1394.0 | 2 | 2 |
y_train.head()
0 167.0 1 131.0 2 180.0 3 210.0 4 85.0 Name: price, dtype: float64
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Split feature columns by dtype: object columns are categorical, rest numeric.
numerical_cols = X_train.select_dtypes(exclude=["object"]).columns
categorical_cols = X_train.select_dtypes(include=["object"]).columns

# Numeric features: constant-fill missing values.
# Categorical features: impute with the mode, then one-hot encode; unseen
# categories at transform time are encoded as all-zeros.
# NOTE(review): `sparse=` was renamed `sparse_output=` in scikit-learn 1.2 —
# update this kwarg if the environment is ever upgraded.
preprocessor = ColumnTransformer(transformers=[
    ('num', SimpleImputer(strategy='constant'), numerical_cols),
    ('cat',
     Pipeline(steps=[
         ('imputer', SimpleImputer(strategy='most_frequent')),
         ('onehot', OneHotEncoder(handle_unknown='ignore', sparse=False)),
     ]),
     categorical_cols),
])

# Preprocessing-only pipeline, used to build XGBoost's eval_set later on.
bundle = Pipeline([('preprocessor', preprocessor)])
eval_test = bundle.fit(X_train).transform(X_test)

# Shared CV splitter so every model is scored on identical folds.
cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
from sklearn.linear_model import LinearRegression

# Baseline model: ordinary least squares on the preprocessed features.
lr_model = LinearRegression()
lr_pipe = Pipeline([('preprocessor', preprocessor), ('model', lr_model)])
lr_pipe.fit(X_train, y_train)
Pipeline(steps=[('preprocessor',
ColumnTransformer(transformers=[('num',
SimpleImputer(strategy='constant'),
Index(['total_sqft', 'bath', 'bhk'], dtype='object')),
('cat',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('onehot',
OneHotEncoder(handle_unknown='ignore',
sparse=False))]),
Index(['location'], dtype='object'))])),
('model', LinearRegression())])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. Pipeline(steps=[('preprocessor',
ColumnTransformer(transformers=[('num',
SimpleImputer(strategy='constant'),
Index(['total_sqft', 'bath', 'bhk'], dtype='object')),
('cat',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('onehot',
OneHotEncoder(handle_unknown='ignore',
sparse=False))]),
Index(['location'], dtype='object'))])),
('model', LinearRegression())])ColumnTransformer(transformers=[('num', SimpleImputer(strategy='constant'),
Index(['total_sqft', 'bath', 'bhk'], dtype='object')),
('cat',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('onehot',
OneHotEncoder(handle_unknown='ignore',
sparse=False))]),
Index(['location'], dtype='object'))])Index(['total_sqft', 'bath', 'bhk'], dtype='object')
SimpleImputer(strategy='constant')
Index(['location'], dtype='object')
SimpleImputer(strategy='most_frequent')
OneHotEncoder(handle_unknown='ignore', sparse=False)
LinearRegression()
# R² on seen vs. held-out data for the linear baseline.
pd.Series({
    "Train Score": lr_pipe.score(X_train, y_train),
    "Test Score": lr_pipe.score(X_test, y_test),
})

# Shuffle-split CV gives a less optimistic estimate than the single test split.
lr_cv_score = cross_val_score(lr_pipe, X_train, y_train, cv=cv)
print("Cross Validation Score \t\t", lr_cv_score, "\n",
      "Cross Validation Mean Score \t", lr_cv_score.mean(), sep="")
Cross Validation Score [0.83154931 0.88158449 0.86728667 0.78621933 0.86571685] Cross Validation Mean Score 0.8464713287043215
from sklearn.tree import DecisionTreeRegressor
def objective(trial):
    """Optuna objective: mean CV R² of a DecisionTreeRegressor pipeline.

    Samples the split criterion, the splitter strategy, and the maximum
    number of leaves, then scores the full preprocessing+model pipeline
    with the shared ShuffleSplit `cv`.
    """
    dt_params = dict(
        criterion=trial.suggest_categorical('criterion', ["squared_error", "friedman_mse"]),
        splitter=trial.suggest_categorical('splitter', ["best", "random"]),
        max_leaf_nodes=trial.suggest_int("max_leaf_nodes", 100, 3000)
    )
    dt = Pipeline(steps=[
        ('preprocessor', preprocessor),
        # random_state pins tie-breaking between equal splits (and the
        # 'random' splitter) so trial scores are reproducible across runs;
        # without it the study's best_params drift between executions.
        ('model', DecisionTreeRegressor(random_state=0, **dt_params))
    ])
    return cross_val_score(dt, X_train, y_train, cv=cv).mean()
# Maximize mean CV R² over 20 sampled hyper-parameter configurations.
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=20)
# Best configuration found; used to build the final tree below.
dt_params = study.best_params
[I 2022-06-11 21:57:34,951] A new study created in memory with name: no-name-d7614cfa-81bb-4fff-95e9-79c699f12eac [I 2022-06-11 21:57:36,417] Trial 0 finished with value: 0.7968530196240977 and parameters: {'criterion': 'friedman_mse', 'splitter': 'best', 'max_leaf_nodes': 2075}. Best is trial 0 with value: 0.7968530196240977. [I 2022-06-11 21:57:38,080] Trial 1 finished with value: 0.7678717514895723 and parameters: {'criterion': 'squared_error', 'splitter': 'random', 'max_leaf_nodes': 2912}. Best is trial 0 with value: 0.7968530196240977. [I 2022-06-11 21:57:39,767] Trial 2 finished with value: 0.7926360459711335 and parameters: {'criterion': 'squared_error', 'splitter': 'best', 'max_leaf_nodes': 2834}. Best is trial 0 with value: 0.7968530196240977. [I 2022-06-11 21:57:40,630] Trial 3 finished with value: 0.7818653309434953 and parameters: {'criterion': 'friedman_mse', 'splitter': 'random', 'max_leaf_nodes': 434}. Best is trial 0 with value: 0.7968530196240977. [I 2022-06-11 21:57:42,053] Trial 4 finished with value: 0.8026220034138282 and parameters: {'criterion': 'friedman_mse', 'splitter': 'best', 'max_leaf_nodes': 1177}. Best is trial 4 with value: 0.8026220034138282. [I 2022-06-11 21:57:43,311] Trial 5 finished with value: 0.7879497128602452 and parameters: {'criterion': 'squared_error', 'splitter': 'random', 'max_leaf_nodes': 2740}. Best is trial 4 with value: 0.8026220034138282. [I 2022-06-11 21:57:44,588] Trial 6 finished with value: 0.7887850124445721 and parameters: {'criterion': 'squared_error', 'splitter': 'random', 'max_leaf_nodes': 2765}. Best is trial 4 with value: 0.8026220034138282. [I 2022-06-11 21:57:45,762] Trial 7 finished with value: 0.8045119069173114 and parameters: {'criterion': 'squared_error', 'splitter': 'best', 'max_leaf_nodes': 663}. Best is trial 7 with value: 0.8045119069173114. 
[I 2022-06-11 21:57:47,138] Trial 8 finished with value: 0.790289145586975 and parameters: {'criterion': 'friedman_mse', 'splitter': 'best', 'max_leaf_nodes': 1061}. Best is trial 7 with value: 0.8045119069173114. [I 2022-06-11 21:57:48,111] Trial 9 finished with value: 0.7909316891334115 and parameters: {'criterion': 'friedman_mse', 'splitter': 'best', 'max_leaf_nodes': 392}. Best is trial 7 with value: 0.8045119069173114. [I 2022-06-11 21:57:49,737] Trial 10 finished with value: 0.8036756154291794 and parameters: {'criterion': 'squared_error', 'splitter': 'best', 'max_leaf_nodes': 1867}. Best is trial 7 with value: 0.8045119069173114. [I 2022-06-11 21:57:51,292] Trial 11 finished with value: 0.8040953689108153 and parameters: {'criterion': 'squared_error', 'splitter': 'best', 'max_leaf_nodes': 1871}. Best is trial 7 with value: 0.8045119069173114. [I 2022-06-11 21:57:52,744] Trial 12 finished with value: 0.7988572065627937 and parameters: {'criterion': 'squared_error', 'splitter': 'best', 'max_leaf_nodes': 1355}. Best is trial 7 with value: 0.8045119069173114. [I 2022-06-11 21:57:54,081] Trial 13 finished with value: 0.8050947771752798 and parameters: {'criterion': 'squared_error', 'splitter': 'best', 'max_leaf_nodes': 733}. Best is trial 13 with value: 0.8050947771752798. [I 2022-06-11 21:57:55,339] Trial 14 finished with value: 0.798618611006402 and parameters: {'criterion': 'squared_error', 'splitter': 'best', 'max_leaf_nodes': 757}. Best is trial 13 with value: 0.8050947771752798. [I 2022-06-11 21:57:56,001] Trial 15 finished with value: 0.8023047624334361 and parameters: {'criterion': 'squared_error', 'splitter': 'best', 'max_leaf_nodes': 152}. Best is trial 13 with value: 0.8050947771752798. [I 2022-06-11 21:57:57,248] Trial 16 finished with value: 0.80498076851405 and parameters: {'criterion': 'squared_error', 'splitter': 'best', 'max_leaf_nodes': 766}. Best is trial 13 with value: 0.8050947771752798. 
[I 2022-06-11 21:57:58,580] Trial 17 finished with value: 0.8041300480080753 and parameters: {'criterion': 'squared_error', 'splitter': 'best', 'max_leaf_nodes': 933}. Best is trial 13 with value: 0.8050947771752798. [I 2022-06-11 21:57:59,799] Trial 18 finished with value: 0.7757464455049321 and parameters: {'criterion': 'squared_error', 'splitter': 'random', 'max_leaf_nodes': 1491}. Best is trial 13 with value: 0.8050947771752798. [I 2022-06-11 21:58:00,448] Trial 19 finished with value: 0.8008839608226905 and parameters: {'criterion': 'squared_error', 'splitter': 'best', 'max_leaf_nodes': 135}. Best is trial 13 with value: 0.8050947771752798.
# Search diagnostics: which hyper-parameters mattered, and score trajectory.
optuna.visualization.plot_param_importances(study)
optuna.visualization.plot_optimization_history(study)
# Display the winning configuration.
dt_params
{'criterion': 'squared_error', 'splitter': 'best', 'max_leaf_nodes': 733}
# Freeze the tuned values so reruns don't depend on re-executing the search.
dt_params = {'criterion': 'squared_error', 'splitter': 'best', 'max_leaf_nodes': 733}

from sklearn.tree import DecisionTreeRegressor

# Final decision tree with the tuned hyper-parameters, wrapped with the
# shared preprocessing step.
dt_model = DecisionTreeRegressor(**dt_params)
dt_pipe = Pipeline([('preprocessor', preprocessor), ('model', dt_model)])
dt_pipe.fit(X_train, y_train)
Pipeline(steps=[('preprocessor',
ColumnTransformer(transformers=[('num',
SimpleImputer(strategy='constant'),
Index(['total_sqft', 'bath', 'bhk'], dtype='object')),
('cat',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('onehot',
OneHotEncoder(handle_unknown='ignore',
sparse=False))]),
Index(['location'], dtype='object'))])),
('model', DecisionTreeRegressor(max_leaf_nodes=733))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. Pipeline(steps=[('preprocessor',
ColumnTransformer(transformers=[('num',
SimpleImputer(strategy='constant'),
Index(['total_sqft', 'bath', 'bhk'], dtype='object')),
('cat',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('onehot',
OneHotEncoder(handle_unknown='ignore',
sparse=False))]),
Index(['location'], dtype='object'))])),
('model', DecisionTreeRegressor(max_leaf_nodes=733))])ColumnTransformer(transformers=[('num', SimpleImputer(strategy='constant'),
Index(['total_sqft', 'bath', 'bhk'], dtype='object')),
('cat',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('onehot',
OneHotEncoder(handle_unknown='ignore',
sparse=False))]),
Index(['location'], dtype='object'))])Index(['total_sqft', 'bath', 'bhk'], dtype='object')
SimpleImputer(strategy='constant')
Index(['location'], dtype='object')
SimpleImputer(strategy='most_frequent')
OneHotEncoder(handle_unknown='ignore', sparse=False)
DecisionTreeRegressor(max_leaf_nodes=733)
# Overfit check: train vs. held-out R² for the tuned tree.
pd.Series({
    "Train Score": dt_pipe.score(X_train, y_train),
    "Test Score": dt_pipe.score(X_test, y_test),
})

dt_cv_score = cross_val_score(dt_pipe, X_train, y_train, cv=cv)
print("Cross Validation Score \t\t", dt_cv_score, "\n",
      "Cross Validation Mean Score \t", dt_cv_score.mean(), sep="")
# Kept for reference: a random-forest hyper-parameter search, disabled (as a
# bare string literal) because 20 trials of up to 500 trees each is too slow.
"""We are not hypertuning RandomForestRegressor because it's going to take a long time
from sklearn.ensemble import RandomForestRegressor
def objective(trial):
rf_params = dict(
n_estimators=trial.suggest_int("n_estimators", 50, 500),
criterion=trial.suggest_categorical('criterion', ["squared_error", "absolute_error"]),
max_leaf_nodes=trial.suggest_int("max_leaf_nodes", 100, 3000)
)
rf = Pipeline(steps=[
('preprocessor', preprocessor),
('model', RandomForestRegressor(**rf_params))
])
return cross_val_score(rf, X_train, y_train, cv=cv).mean()
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=20)
rf_params = study.best_params
"""
from sklearn.ensemble import RandomForestRegressor

# Untuned random forest as a stronger ensemble baseline.
# random_state=0 pins the bootstrap sampling and per-split feature
# subsampling, so the train/test/CV scores below are reproducible —
# without it every notebook run reports different numbers.
rf_model = RandomForestRegressor(random_state=0)
# Bundle preprocessing and modeling code in a pipeline
rf_pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', rf_model)
])
rf_pipe.fit(X_train, y_train)
Pipeline(steps=[('preprocessor',
ColumnTransformer(transformers=[('num',
SimpleImputer(strategy='constant'),
Index(['total_sqft', 'bath', 'bhk'], dtype='object')),
('cat',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('onehot',
OneHotEncoder(handle_unknown='ignore',
sparse=False))]),
Index(['location'], dtype='object'))])),
('model', RandomForestRegressor())])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. Pipeline(steps=[('preprocessor',
ColumnTransformer(transformers=[('num',
SimpleImputer(strategy='constant'),
Index(['total_sqft', 'bath', 'bhk'], dtype='object')),
('cat',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('onehot',
OneHotEncoder(handle_unknown='ignore',
sparse=False))]),
Index(['location'], dtype='object'))])),
('model', RandomForestRegressor())])ColumnTransformer(transformers=[('num', SimpleImputer(strategy='constant'),
Index(['total_sqft', 'bath', 'bhk'], dtype='object')),
('cat',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('onehot',
OneHotEncoder(handle_unknown='ignore',
sparse=False))]),
Index(['location'], dtype='object'))])Index(['total_sqft', 'bath', 'bhk'], dtype='object')
SimpleImputer(strategy='constant')
Index(['location'], dtype='object')
SimpleImputer(strategy='most_frequent')
OneHotEncoder(handle_unknown='ignore', sparse=False)
RandomForestRegressor()
# Train vs. held-out R² for the random forest.
pd.Series({
    "Train Score": rf_pipe.score(X_train, y_train),
    "Test Score": rf_pipe.score(X_test, y_test),
})

rf_cv_score = cross_val_score(rf_pipe, X_train, y_train, cv=cv)
print("Cross Validation Score \t\t", rf_cv_score, "\n",
      "Cross Validation Mean Score \t", rf_cv_score.mean(), sep="")
from xgboost import XGBRegressor
def objective(trial):
    """Optuna objective: mean CV R² of an XGBRegressor pipeline.

    Only learning_rate (log scale) and n_estimators are tuned; every other
    XGBoost parameter keeps its default.
    """
    xgb_params = dict(
        learning_rate=trial.suggest_float("learning_rate", 1e-2, 1e-1, log=True),
        n_estimators=trial.suggest_int("n_estimators", 100, 500)
    )
    xgb = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', XGBRegressor(**xgb_params))
    ])
    # NOTE(review): the original objective fit `xgb` here with early stopping
    # on the test set before scoring. cross_val_score clones the estimator,
    # so that fitted model was discarded — one full training run wasted per
    # trial — and early-stopping against the test set also leaks test data
    # into tuning. The fit is dropped; the returned score is unchanged.
    return cross_val_score(xgb, X_train, y_train, cv=cv).mean()
# Run 20 trials maximizing mean CV R²; keep the best parameter set.
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=20)
xgb_params = study.best_params
C:\Users\User\anaconda3\envs\data-science\lib\site-packages\xgboost\compat.py:36: FutureWarning: pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead. [I 2022-06-11 22:03:37,233] A new study created in memory with name: no-name-ccccb1da-441e-4708-8135-7051f5fb2226 [I 2022-06-11 22:03:53,622] Trial 0 finished with value: 0.7843913570955278 and parameters: {'learning_rate': 0.01826119870880335, 'n_estimators': 260}. Best is trial 0 with value: 0.7843913570955278. [I 2022-06-11 22:04:17,672] Trial 1 finished with value: 0.8089597798789938 and parameters: {'learning_rate': 0.022755442929083564, 'n_estimators': 387}. Best is trial 1 with value: 0.8089597798789938. [I 2022-06-11 22:04:43,016] Trial 2 finished with value: 0.7939356339224122 and parameters: {'learning_rate': 0.01507936473215109, 'n_estimators': 386}. Best is trial 1 with value: 0.8089597798789938. [I 2022-06-11 22:05:06,307] Trial 3 finished with value: 0.8000568025622494 and parameters: {'learning_rate': 0.01897567434963167, 'n_estimators': 358}. Best is trial 1 with value: 0.8089597798789938. [I 2022-06-11 22:05:24,091] Trial 4 finished with value: 0.8199828063157588 and parameters: {'learning_rate': 0.04828024435485572, 'n_estimators': 274}. Best is trial 4 with value: 0.8199828063157588. [I 2022-06-11 22:05:32,821] Trial 5 finished with value: 0.7678444922582894 and parameters: {'learning_rate': 0.02819118930965911, 'n_estimators': 126}. Best is trial 4 with value: 0.8199828063157588. [I 2022-06-11 22:05:40,562] Trial 6 finished with value: 0.7101888081996007 and parameters: {'learning_rate': 0.01904233173440493, 'n_estimators': 109}. Best is trial 4 with value: 0.8199828063157588. [I 2022-06-11 22:05:57,002] Trial 7 finished with value: 0.8371117497873091 and parameters: {'learning_rate': 0.09710329326224534, 'n_estimators': 253}. Best is trial 7 with value: 0.8371117497873091. 
[I 2022-06-11 22:06:24,256] Trial 8 finished with value: 0.8228104378608971 and parameters: {'learning_rate': 0.035197614252757095, 'n_estimators': 428}. Best is trial 7 with value: 0.8371117497873091. [I 2022-06-11 22:06:55,735] Trial 9 finished with value: 0.8059293336732521 and parameters: {'learning_rate': 0.016226617718595095, 'n_estimators': 489}. Best is trial 7 with value: 0.8371117497873091. [I 2022-06-11 22:07:08,898] Trial 10 finished with value: 0.8330067893954605 and parameters: {'learning_rate': 0.09691256210385771, 'n_estimators': 204}. Best is trial 7 with value: 0.8371117497873091. [I 2022-06-11 22:07:21,512] Trial 11 finished with value: 0.8268931534729113 and parameters: {'learning_rate': 0.09823212106053601, 'n_estimators': 191}. Best is trial 7 with value: 0.8371117497873091. [I 2022-06-11 22:07:34,172] Trial 12 finished with value: 0.8283738175908549 and parameters: {'learning_rate': 0.09947709602579106, 'n_estimators': 200}. Best is trial 7 with value: 0.8371117497873091. [I 2022-06-11 22:07:48,509] Trial 13 finished with value: 0.8212699276036867 and parameters: {'learning_rate': 0.06312460235460259, 'n_estimators': 217}. Best is trial 7 with value: 0.8371117497873091. [I 2022-06-11 22:08:08,475] Trial 14 finished with value: 0.8302348469581364 and parameters: {'learning_rate': 0.06772987781045565, 'n_estimators': 306}. Best is trial 7 with value: 0.8371117497873091. [I 2022-06-11 22:08:19,100] Trial 15 finished with value: 0.6546668604234721 and parameters: {'learning_rate': 0.01093480355775935, 'n_estimators': 152}. Best is trial 7 with value: 0.8371117497873091. [I 2022-06-11 22:08:39,295] Trial 16 finished with value: 0.8324266153833081 and parameters: {'learning_rate': 0.07314122632985683, 'n_estimators': 314}. Best is trial 7 with value: 0.8371117497873091. [I 2022-06-11 22:08:55,020] Trial 17 finished with value: 0.817071851028552 and parameters: {'learning_rate': 0.04847666310928432, 'n_estimators': 241}. 
Best is trial 7 with value: 0.8371117497873091. [I 2022-06-11 22:09:06,755] Trial 18 finished with value: 0.8062879333357265 and parameters: {'learning_rate': 0.04834862855631613, 'n_estimators': 168}. Best is trial 7 with value: 0.8371117497873091. [I 2022-06-11 22:09:27,891] Trial 19 finished with value: 0.832840296714345 and parameters: {'learning_rate': 0.07806085341112905, 'n_estimators': 325}. Best is trial 7 with value: 0.8371117497873091.
# Search diagnostics: parameter importances and optimization trajectory.
optuna.visualization.plot_param_importances(study)
optuna.visualization.plot_optimization_history(study)
# Display the best configuration.
xgb_params
{'learning_rate': 0.09666399416904463, 'n_estimators': 364}
# Pin the tuned values so the final model doesn't depend on rerunning Optuna.
xgb_params = {'learning_rate': 0.09666399416904463, 'n_estimators': 364}

from xgboost import XGBRegressor

# Final XGBoost model with the tuned learning rate and tree count.
xgb_model = XGBRegressor(**xgb_params)
xgb_pipe = Pipeline([('preprocessor', preprocessor), ('model', xgb_model)])

# NOTE(review): early stopping monitors the preprocessed TEST set, so test
# data influences training — the test score reported below is optimistically
# biased. A held-out validation split would be the clean choice.
xgb_pipe.fit(
    X_train, y_train,
    model__early_stopping_rounds=100,
    model__eval_set=[(eval_test, y_test)],
    model__verbose=False
)
Pipeline(steps=[('preprocessor',
ColumnTransformer(transformers=[('num',
SimpleImputer(strategy='constant'),
Index(['total_sqft', 'bath', 'bhk'], dtype='object')),
('cat',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('onehot',
OneHotEncoder(handle_unknown='ignore',
sparse=False))]),
Index(['location'], dtype='object'))])),
('model',
XGBRe...
gamma=0, gpu_id=-1, importance_type=None,
interaction_constraints='',
learning_rate=0.09666399416904463,
max_delta_step=0, max_depth=6, min_child_weight=1,
missing=nan, monotone_constraints='()',
n_estimators=364, n_jobs=8, num_parallel_tree=1,
predictor='auto', random_state=0, reg_alpha=0,
reg_lambda=1, scale_pos_weight=1, subsample=1,
tree_method='exact', validate_parameters=1,
verbosity=None))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. Pipeline(steps=[('preprocessor',
ColumnTransformer(transformers=[('num',
SimpleImputer(strategy='constant'),
Index(['total_sqft', 'bath', 'bhk'], dtype='object')),
('cat',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('onehot',
OneHotEncoder(handle_unknown='ignore',
sparse=False))]),
Index(['location'], dtype='object'))])),
('model',
XGBRe...
gamma=0, gpu_id=-1, importance_type=None,
interaction_constraints='',
learning_rate=0.09666399416904463,
max_delta_step=0, max_depth=6, min_child_weight=1,
missing=nan, monotone_constraints='()',
n_estimators=364, n_jobs=8, num_parallel_tree=1,
predictor='auto', random_state=0, reg_alpha=0,
reg_lambda=1, scale_pos_weight=1, subsample=1,
tree_method='exact', validate_parameters=1,
verbosity=None))])ColumnTransformer(transformers=[('num', SimpleImputer(strategy='constant'),
Index(['total_sqft', 'bath', 'bhk'], dtype='object')),
('cat',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('onehot',
OneHotEncoder(handle_unknown='ignore',
sparse=False))]),
Index(['location'], dtype='object'))])Index(['total_sqft', 'bath', 'bhk'], dtype='object')
SimpleImputer(strategy='constant')
Index(['location'], dtype='object')
SimpleImputer(strategy='most_frequent')
OneHotEncoder(handle_unknown='ignore', sparse=False)
XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
gamma=0, gpu_id=-1, importance_type=None,
interaction_constraints='', learning_rate=0.09666399416904463,
max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
monotone_constraints='()', n_estimators=364, n_jobs=8,
num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)
pd.Series({
"Train Score": xgb_pipe.score(X_train, y_train),
"Test Score": xgb_pipe.score(X_test, y_test)
})
Train Score 0.974757 Test Score 0.934992 dtype: float64
# Cross-validated R² for the tuned XGBoost pipeline.
xgb_cv_score = cross_val_score(xgb_pipe, X_train, y_train, cv=cv)
print("Cross Validation Score \t\t", xgb_cv_score, "\n",
      "Cross Validation Mean Score \t", xgb_cv_score.mean(), sep="")
# Side-by-side comparison of all four models on the held-out test set.
scores = []
models = [lr_pipe, dt_pipe, rf_pipe, xgb_pipe]
for model in models:
    # Predict once per model instead of once per metric (the original called
    # model.predict(X_test) three times, tripling the inference work).
    y_pred = model.predict(X_test)
    scores.append({
        # Estimator repr up to the first '(' is the class name.
        'Model': str(model.named_steps.model).split('(')[0],
        'r2 Score': r2_score(y_test, y_pred),
        'MAE Score': mean_absolute_error(y_test, y_pred),
        'MSE Score': mean_squared_error(y_test, y_pred)
    })
pd.DataFrame(scores).set_index('Model')
| r2 Score | MAE Score | MSE Score | |
|---|---|---|---|
| Model | |||
| LinearRegression | 0.790460 | 16.634335 | 1979.243270 |
| DecisionTreeRegressor | 0.899828 | 16.213864 | 946.188374 |
| RandomForestRegressor | 0.888108 | 15.006293 | 1056.890782 |
| XGBRegressor | 0.934992 | 13.591667 | 614.045627 |
# Persist the best pipeline (preprocessing + tuned XGBoost) for later reuse.
with open("model.pkl", "wb") as f:
    pickle.dump(xgb_pipe, f)